{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.simplefilter('ignore')\n",
    "import gc\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "pd.set_option('max_columns', None)\n",
    "pd.set_option('max_rows', None)\n",
    "\n",
    "from tqdm import tqdm\n",
    "tqdm.pandas()\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn.metrics import auc, accuracy_score\n",
    "\n",
    "import lightgbm as lgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training and test splits from raw_data/.\n",
    "train, test = (pd.read_csv(path) for path in ('raw_data/train.csv', 'raw_data/test.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case every string column listed below, on both splits.\n",
    "text_cols = ['benefits', 'company_profile', 'department', 'description',\n",
    "             'employment_type', 'function', 'industry', 'location', 'required_education',\n",
    "             'required_experience', 'requirements', 'title']\n",
    "for frame in (train, test):\n",
    "    for col in text_cols:\n",
    "        frame[col] = frame[col].str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(x):\n",
    "    if x == 'nan':\n",
    "        return 0\n",
    "    else:\n",
    "        return len(x.split())\n",
    "\n",
    "\n",
    "# Add a `<col>_wordsLen` word-count feature for each of these text columns, on both splits.\n",
    "for col in ['benefits', 'title', 'company_profile', 'description', 'requirements']:\n",
    "    for frame in (train, test):\n",
    "        frame[f'{col}_wordsLen'] = frame[col].astype('str').apply(process)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process1(x):\n",
    "    if x == 'nan':\n",
    "        return -999\n",
    "    else:\n",
    "        try:\n",
    "            return int(x.split('-')[0])\n",
    "        except:\n",
    "            return -998\n",
    "    \n",
    "def process2(x):\n",
    "    if x == 'nan':\n",
    "        return -999\n",
    "    else:\n",
    "        try:\n",
    "            return int(x.split('-')[1])\n",
    "        except:\n",
    "            return -998\n",
    "    \n",
    "\n",
    "# Replace the raw 'salary_range' string with numeric start / end columns on both splits.\n",
    "for frame in (train, test):\n",
    "    salary = frame['salary_range'].astype('str')\n",
    "    frame['salary_range_start'] = salary.apply(process1)\n",
    "    frame['salary_range_end'] = salary.apply(process2)\n",
    "    del frame['salary_range']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8/8 [00:00<00:00, 98.85it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "34"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack both splits so each categorical column is encoded with one shared label mapping.\n",
    "df = pd.concat([train, test])\n",
    "del train, test\n",
    "\n",
    "cat_cols = ['department', 'employment_type', 'function', 'industry',\n",
    "            'location', 'required_education', 'required_experience', 'title']\n",
    "for cat_col in tqdm(cat_cols):\n",
    "    df[cat_col] = LabelEncoder().fit_transform(df[cat_col].astype(str))\n",
    "\n",
    "# Rows with a 'fraudulent' label go back to train; rows without one go to test.\n",
    "has_label = df['fraudulent'].notnull()\n",
    "train = df[has_label].copy()\n",
    "test = df[~has_label].copy()\n",
    "\n",
    "del df\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tfidf(train, test, colname, max_features):\n",
    "    \"\"\"Append TF-IDF features of a text column to ``train`` and ``test``.\n",
    "\n",
    "    The vectorizer (uni- and bi-grams, English stop words removed) is fitted on\n",
    "    the training text only and then applied to both frames. Missing text is\n",
    "    replaced by the string 'nan' before vectorizing. One column named\n",
    "    ``{colname}_tfidf{i}`` is added, in place, per vocabulary term.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    train, test : pandas.DataFrame\n",
    "        Frames containing ``colname``; both are modified in place.\n",
    "    colname : str\n",
    "        Name of the text column to vectorize.\n",
    "    max_features : int\n",
    "        Upper bound on the vocabulary size (passed to TfidfVectorizer).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        The same ``train`` and ``test`` objects, with the new columns added.\n",
    "    \"\"\"\n",
    "    # min_df=1 keeps the same vocabulary as the former min_df=0 (every term occurs\n",
    "    # in at least one document) and is accepted by recent scikit-learn, which\n",
    "    # rejects an integer min_df below 1.\n",
    "    tf = TfidfVectorizer(min_df=1,\n",
    "                         ngram_range=(1, 2),\n",
    "                         stop_words='english',\n",
    "                         max_features=max_features)\n",
    "    X = tf.fit_transform(train[colname].fillna('nan').tolist())\n",
    "    X_test = tf.transform(test[colname].fillna('nan').tolist())\n",
    "\n",
    "    dense = X.toarray()\n",
    "    dense_test = X_test.toarray()\n",
    "    # Size the new columns from the fitted vocabulary, which can hold fewer than\n",
    "    # `max_features` terms.\n",
    "    for i in range(dense.shape[1]):\n",
    "        col = f'{colname}_tfidf{i}'\n",
    "        # Assign plain arrays so rows are matched by position rather than by\n",
    "        # index label (the frames' indexes need not be 0..n-1).\n",
    "        train[col] = dense[:, i]\n",
    "        test[col] = dense_test[:, i]\n",
    "\n",
    "    return train, test\n",
    "\n",
    "\n",
    "# (text column, vocabulary size) pairs, processed in this order.\n",
    "tfidf_specs = [('benefits', 12), ('company_profile', 24), ('description', 48), ('requirements', 20)]\n",
    "for text_col, n_terms in tfidf_specs:\n",
    "    train, test = get_tfidf(train, test, text_col, n_terms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the raw text columns now that word-count and TF-IDF features exist for them.\n",
    "to_drop = ['benefits', 'company_profile', 'description', 'requirements']\n",
    "\n",
    "train, test = (frame.drop(columns=to_drop) for frame in (train, test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((17680, 124), (200, 124))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Expose each frame's index as an explicit 'id' column, then show both shapes.\n",
    "for frame in (train, test):\n",
    "    frame['id'] = frame.index\n",
    "\n",
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>benefits_wordsLen</th>\n",
       "      <th>company_profile_wordsLen</th>\n",
       "      <th>department</th>\n",
       "      <th>description_wordsLen</th>\n",
       "      <th>employment_type</th>\n",
       "      <th>fraudulent</th>\n",
       "      <th>function</th>\n",
       "      <th>has_company_logo</th>\n",
       "      <th>has_questions</th>\n",
       "      <th>industry</th>\n",
       "      <th>location</th>\n",
       "      <th>required_education</th>\n",
       "      <th>required_experience</th>\n",
       "      <th>requirements_wordsLen</th>\n",
       "      <th>salary_range_end</th>\n",
       "      <th>salary_range_start</th>\n",
       "      <th>telecommuting</th>\n",
       "      <th>title</th>\n",
       "      <th>title_wordsLen</th>\n",
       "      <th>benefits_tfidf0</th>\n",
       "      <th>benefits_tfidf1</th>\n",
       "      <th>benefits_tfidf2</th>\n",
       "      <th>benefits_tfidf3</th>\n",
       "      <th>benefits_tfidf4</th>\n",
       "      <th>benefits_tfidf5</th>\n",
       "      <th>benefits_tfidf6</th>\n",
       "      <th>benefits_tfidf7</th>\n",
       "      <th>benefits_tfidf8</th>\n",
       "      <th>benefits_tfidf9</th>\n",
       "      <th>benefits_tfidf10</th>\n",
       "      <th>benefits_tfidf11</th>\n",
       "      <th>company_profile_tfidf0</th>\n",
       "      <th>company_profile_tfidf1</th>\n",
       "      <th>company_profile_tfidf2</th>\n",
       "      <th>company_profile_tfidf3</th>\n",
       "      <th>company_profile_tfidf4</th>\n",
       "      <th>company_profile_tfidf5</th>\n",
       "      <th>company_profile_tfidf6</th>\n",
       "      <th>company_profile_tfidf7</th>\n",
       "      <th>company_profile_tfidf8</th>\n",
       "      <th>company_profile_tfidf9</th>\n",
       "      <th>company_profile_tfidf10</th>\n",
       "      <th>company_profile_tfidf11</th>\n",
       "      <th>company_profile_tfidf12</th>\n",
       "      <th>company_profile_tfidf13</th>\n",
       "      <th>company_profile_tfidf14</th>\n",
       "      <th>company_profile_tfidf15</th>\n",
       "      <th>company_profile_tfidf16</th>\n",
       "      <th>company_profile_tfidf17</th>\n",
       "      <th>company_profile_tfidf18</th>\n",
       "      <th>company_profile_tfidf19</th>\n",
       "      <th>company_profile_tfidf20</th>\n",
       "      <th>company_profile_tfidf21</th>\n",
       "      <th>company_profile_tfidf22</th>\n",
       "      <th>company_profile_tfidf23</th>\n",
       "      <th>description_tfidf0</th>\n",
       "      <th>description_tfidf1</th>\n",
       "      <th>description_tfidf2</th>\n",
       "      <th>description_tfidf3</th>\n",
       "      <th>description_tfidf4</th>\n",
       "      <th>description_tfidf5</th>\n",
       "      <th>description_tfidf6</th>\n",
       "      <th>description_tfidf7</th>\n",
       "      <th>description_tfidf8</th>\n",
       "      <th>description_tfidf9</th>\n",
       "      <th>description_tfidf10</th>\n",
       "      <th>description_tfidf11</th>\n",
       "      <th>description_tfidf12</th>\n",
       "      <th>description_tfidf13</th>\n",
       "      <th>description_tfidf14</th>\n",
       "      <th>description_tfidf15</th>\n",
       "      <th>description_tfidf16</th>\n",
       "      <th>description_tfidf17</th>\n",
       "      <th>description_tfidf18</th>\n",
       "      <th>description_tfidf19</th>\n",
       "      <th>description_tfidf20</th>\n",
       "      <th>description_tfidf21</th>\n",
       "      <th>description_tfidf22</th>\n",
       "      <th>description_tfidf23</th>\n",
       "      <th>description_tfidf24</th>\n",
       "      <th>description_tfidf25</th>\n",
       "      <th>description_tfidf26</th>\n",
       "      <th>description_tfidf27</th>\n",
       "      <th>description_tfidf28</th>\n",
       "      <th>description_tfidf29</th>\n",
       "      <th>description_tfidf30</th>\n",
       "      <th>description_tfidf31</th>\n",
       "      <th>description_tfidf32</th>\n",
       "      <th>description_tfidf33</th>\n",
       "      <th>description_tfidf34</th>\n",
       "      <th>description_tfidf35</th>\n",
       "      <th>description_tfidf36</th>\n",
       "      <th>description_tfidf37</th>\n",
       "      <th>description_tfidf38</th>\n",
       "      <th>description_tfidf39</th>\n",
       "      <th>description_tfidf40</th>\n",
       "      <th>description_tfidf41</th>\n",
       "      <th>description_tfidf42</th>\n",
       "      <th>description_tfidf43</th>\n",
       "      <th>description_tfidf44</th>\n",
       "      <th>description_tfidf45</th>\n",
       "      <th>description_tfidf46</th>\n",
       "      <th>description_tfidf47</th>\n",
       "      <th>requirements_tfidf0</th>\n",
       "      <th>requirements_tfidf1</th>\n",
       "      <th>requirements_tfidf2</th>\n",
       "      <th>requirements_tfidf3</th>\n",
       "      <th>requirements_tfidf4</th>\n",
       "      <th>requirements_tfidf5</th>\n",
       "      <th>requirements_tfidf6</th>\n",
       "      <th>requirements_tfidf7</th>\n",
       "      <th>requirements_tfidf8</th>\n",
       "      <th>requirements_tfidf9</th>\n",
       "      <th>requirements_tfidf10</th>\n",
       "      <th>requirements_tfidf11</th>\n",
       "      <th>requirements_tfidf12</th>\n",
       "      <th>requirements_tfidf13</th>\n",
       "      <th>requirements_tfidf14</th>\n",
       "      <th>requirements_tfidf15</th>\n",
       "      <th>requirements_tfidf16</th>\n",
       "      <th>requirements_tfidf17</th>\n",
       "      <th>requirements_tfidf18</th>\n",
       "      <th>requirements_tfidf19</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>82</td>\n",
       "      <td>114</td>\n",
       "      <td>49</td>\n",
       "      <td>104</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>23</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>86</td>\n",
       "      <td>788</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>72</td>\n",
       "      <td>-999</td>\n",
       "      <td>-999</td>\n",
       "      <td>0</td>\n",
       "      <td>6520</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.467817</td>\n",
       "      <td>0.517759</td>\n",
       "      <td>0.486184</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.526022</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.201000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.395883</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.7612</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.359037</td>\n",
       "      <td>0.232873</td>\n",
       "      <td>0.200779</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.261991</td>\n",
       "      <td>0.292880</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.245878</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.364591</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.260746</td>\n",
       "      <td>0.325666</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.303185</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.312115</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.326553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.310509</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.292853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.744641</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.667466</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>79</td>\n",
       "      <td>431</td>\n",
       "      <td>165</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>22</td>\n",
       "      <td>1403</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>-999</td>\n",
       "      <td>-999</td>\n",
       "      <td>0</td>\n",
       "      <td>3679</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.656718</td>\n",
       "      <td>0.754137</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.529610</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.472876</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.467282</td>\n",
       "      <td>0.526830</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.116965</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.133494</td>\n",
       "      <td>0.387710</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.442036</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.396730</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.138828</td>\n",
       "      <td>0.134289</td>\n",
       "      <td>0.099286</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.247637</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.126252</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.545214</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.156471</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.082373</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.501621</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.166066</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.175545</td>\n",
       "      <td>0.228821</td>\n",
       "      <td>0.423701</td>\n",
       "      <td>0.208787</td>\n",
       "      <td>0.217190</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.180581</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.278790</td>\n",
       "      <td>0.193091</td>\n",
       "      <td>0.185068</td>\n",
       "      <td>0.300812</td>\n",
       "      <td>0.189869</td>\n",
       "      <td>0.190502</td>\n",
       "      <td>0.150466</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>808</td>\n",
       "      <td>201</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>23</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>86</td>\n",
       "      <td>1979</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>38</td>\n",
       "      <td>130000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9367</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.139291</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.153318</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.461716</td>\n",
       "      <td>0.159210</td>\n",
       "      <td>0.131603</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.236476</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.294905</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.149121</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.162321</td>\n",
       "      <td>0.328658</td>\n",
       "      <td>0.281607</td>\n",
       "      <td>0.186338</td>\n",
       "      <td>0.341494</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.337796</td>\n",
       "      <td>0.196193</td>\n",
       "      <td>0.136055</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.429303</td>\n",
       "      <td>0.446580</td>\n",
       "      <td>0.721489</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.309384</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>110</td>\n",
       "      <td>182</td>\n",
       "      <td>808</td>\n",
       "      <td>243</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>22</td>\n",
       "      <td>1445</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>255</td>\n",
       "      <td>-999</td>\n",
       "      <td>-999</td>\n",
       "      <td>0</td>\n",
       "      <td>7342</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.394999</td>\n",
       "      <td>0.638212</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.240403</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.207688</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.376103</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.196463</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.394565</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.210487</td>\n",
       "      <td>0.088168</td>\n",
       "      <td>0.088475</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.152748</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.354022</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.075680</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.135847</td>\n",
       "      <td>0.086617</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.085192</td>\n",
       "      <td>0.083578</td>\n",
       "      <td>0.064117</td>\n",
       "      <td>0.07755</td>\n",
       "      <td>0.190148</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.135989</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.678358</td>\n",
       "      <td>0.184414</td>\n",
       "      <td>0.100236</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.081390</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078808</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.377999</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.107156</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.056412</td>\n",
       "      <td>0.078241</td>\n",
       "      <td>0.292460</td>\n",
       "      <td>0.183591</td>\n",
       "      <td>0.145233</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.307044</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.185273</td>\n",
       "      <td>0.365188</td>\n",
       "      <td>0.189943</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.510770</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.365723</td>\n",
       "      <td>0.168867</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.263074</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.263178</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>42</td>\n",
       "      <td>0</td>\n",
       "      <td>1029</td>\n",
       "      <td>134</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>32</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>2413</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>26</td>\n",
       "      <td>-999</td>\n",
       "      <td>-999</td>\n",
       "      <td>0</td>\n",
       "      <td>8243</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.096060</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.209130</td>\n",
       "      <td>0.120088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.125351</td>\n",
       "      <td>0.103615</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.279276</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.222864</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.11483</td>\n",
       "      <td>0.835844</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.110859</td>\n",
       "      <td>0.220064</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.077234</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.405656</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.263459</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.438102</td>\n",
       "      <td>0.571475</td>\n",
       "      <td>0.338182</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.364896</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   benefits_wordsLen  company_profile_wordsLen  department  \\\n",
       "0                 82                       114          49   \n",
       "1                 10                        79         431   \n",
       "2                  0                         0         808   \n",
       "3                110                       182         808   \n",
       "4                 42                         0        1029   \n",
       "\n",
       "   description_wordsLen  employment_type  fraudulent  function  \\\n",
       "0                   104                2         0.0        23   \n",
       "1                   165                1         0.0        12   \n",
       "2                   201                1         0.0        23   \n",
       "3                   243                1         0.0        25   \n",
       "4                   134                4         0.0        32   \n",
       "\n",
       "   has_company_logo  has_questions  industry  location  required_education  \\\n",
       "0                 1              1        86       788                   6   \n",
       "1                 1              1        22      1403                   6   \n",
       "2                 0              0        86      1979                   1   \n",
       "3                 1              0        22      1445                   1   \n",
       "4                 0              1        72      2413                   8   \n",
       "\n",
       "   required_experience  requirements_wordsLen  salary_range_end  \\\n",
       "0                    6                     72              -999   \n",
       "1                    5                    111              -999   \n",
       "2                    6                     38            130000   \n",
       "3                    0                    255              -999   \n",
       "4                    4                     26              -999   \n",
       "\n",
       "   salary_range_start  telecommuting  title  title_wordsLen  benefits_tfidf0  \\\n",
       "0                -999              0   6520               2              0.0   \n",
       "1                -999              0   3679               3              0.0   \n",
       "2                   0              0   9367               2              0.0   \n",
       "3                -999              0   7342               2              0.0   \n",
       "4                -999              0   8243               2              0.0   \n",
       "\n",
       "   benefits_tfidf1  benefits_tfidf2  benefits_tfidf3  benefits_tfidf4  \\\n",
       "0         0.467817         0.517759         0.486184         0.000000   \n",
       "1         0.000000         0.000000         0.656718         0.754137   \n",
       "2         0.000000         0.000000         0.000000         0.000000   \n",
       "3         0.000000         0.000000         0.000000         0.000000   \n",
       "4         0.000000         0.000000         0.000000         0.000000   \n",
       "\n",
       "   benefits_tfidf5  benefits_tfidf6  benefits_tfidf7  benefits_tfidf8  \\\n",
       "0              0.0              0.0              0.0         0.526022   \n",
       "1              0.0              0.0              0.0         0.000000   \n",
       "2              1.0              0.0              0.0         0.000000   \n",
       "3              0.0              0.0              0.0         0.000000   \n",
       "4              0.0              0.0              0.0         0.000000   \n",
       "\n",
       "   benefits_tfidf9  benefits_tfidf10  benefits_tfidf11  \\\n",
       "0              0.0               0.0               0.0   \n",
       "1              0.0               0.0               0.0   \n",
       "2              0.0               0.0               0.0   \n",
       "3              0.0               0.0               0.0   \n",
       "4              1.0               0.0               0.0   \n",
       "\n",
       "   company_profile_tfidf0  company_profile_tfidf1  company_profile_tfidf2  \\\n",
       "0                     0.0                     0.0                0.201000   \n",
       "1                     0.0                     0.0                0.529610   \n",
       "2                     0.0                     0.0                0.000000   \n",
       "3                     0.0                     0.0                0.394999   \n",
       "4                     0.0                     0.0                0.000000   \n",
       "\n",
       "   company_profile_tfidf3  company_profile_tfidf4  company_profile_tfidf5  \\\n",
       "0                0.000000                0.395883                0.000000   \n",
       "1                0.000000                0.000000                0.472876   \n",
       "2                0.000000                0.000000                0.000000   \n",
       "3                0.638212                0.000000                0.000000   \n",
       "4                0.000000                0.000000                0.000000   \n",
       "\n",
       "   company_profile_tfidf6  company_profile_tfidf7  company_profile_tfidf8  \\\n",
       "0                     0.0                     0.0                     0.0   \n",
       "1                     0.0                     0.0                     0.0   \n",
       "2                     0.0                     0.0                     0.0   \n",
       "3                     0.0                     0.0                     0.0   \n",
       "4                     0.0                     0.0                     0.0   \n",
       "\n",
       "   company_profile_tfidf9  company_profile_tfidf10  company_profile_tfidf11  \\\n",
       "0                     0.0                 0.000000                      0.0   \n",
       "1                     0.0                 0.000000                      0.0   \n",
       "2                     0.0                 0.000000                      1.0   \n",
       "3                     0.0                 0.240403                      0.0   \n",
       "4                     0.0                 0.000000                      1.0   \n",
       "\n",
       "   company_profile_tfidf12  company_profile_tfidf13  company_profile_tfidf14  \\\n",
       "0                 0.000000                   0.7612                      0.0   \n",
       "1                 0.000000                   0.0000                      0.0   \n",
       "2                 0.000000                   0.0000                      0.0   \n",
       "3                 0.207688                   0.0000                      0.0   \n",
       "4                 0.000000                   0.0000                      0.0   \n",
       "\n",
       "   company_profile_tfidf15  company_profile_tfidf16  company_profile_tfidf17  \\\n",
       "0                      0.0                 0.000000                      0.0   \n",
       "1                      0.0                 0.000000                      0.0   \n",
       "2                      0.0                 0.000000                      0.0   \n",
       "3                      0.0                 0.376103                      0.0   \n",
       "4                      0.0                 0.000000                      0.0   \n",
       "\n",
       "   company_profile_tfidf18  company_profile_tfidf19  company_profile_tfidf20  \\\n",
       "0                 0.000000                 0.000000                      0.0   \n",
       "1                 0.467282                 0.526830                      0.0   \n",
       "2                 0.000000                 0.000000                      0.0   \n",
       "3                 0.000000                 0.196463                      0.0   \n",
       "4                 0.000000                 0.000000                      0.0   \n",
       "\n",
       "   company_profile_tfidf21  company_profile_tfidf22  company_profile_tfidf23  \\\n",
       "0                 0.359037                 0.232873                 0.200779   \n",
       "1                 0.000000                 0.000000                 0.000000   \n",
       "2                 0.000000                 0.000000                 0.000000   \n",
       "3                 0.000000                 0.000000                 0.394565   \n",
       "4                 0.000000                 0.000000                 0.000000   \n",
       "\n",
       "   description_tfidf0  description_tfidf1  description_tfidf2  \\\n",
       "0                 0.0            0.000000            0.000000   \n",
       "1                 0.0            0.116965            0.000000   \n",
       "2                 0.0            0.139291            0.000000   \n",
       "3                 0.0            0.000000            0.210487   \n",
       "4                 0.0            0.000000            0.096060   \n",
       "\n",
       "   description_tfidf3  description_tfidf4  description_tfidf5  \\\n",
       "0            0.000000            0.000000            0.261991   \n",
       "1            0.000000            0.000000            0.000000   \n",
       "2            0.153318            0.000000            0.000000   \n",
       "3            0.088168            0.088475            0.000000   \n",
       "4            0.000000            0.000000            0.000000   \n",
       "\n",
       "   description_tfidf6  description_tfidf7  description_tfidf8  \\\n",
       "0            0.292880            0.000000            0.000000   \n",
       "1            0.000000            0.000000            0.133494   \n",
       "2            0.000000            0.000000            0.000000   \n",
       "3            0.152748            0.000000            0.000000   \n",
       "4            0.209130            0.120088            0.000000   \n",
       "\n",
       "   description_tfidf9  description_tfidf10  description_tfidf11  \\\n",
       "0            0.000000             0.000000             0.000000   \n",
       "1            0.387710             0.000000             0.442036   \n",
       "2            0.461716             0.159210             0.131603   \n",
       "3            0.354022             0.000000             0.075680   \n",
       "4            0.000000             0.125351             0.103615   \n",
       "\n",
       "   description_tfidf12  description_tfidf13  description_tfidf14  \\\n",
       "0                  0.0             0.000000             0.000000   \n",
       "1                  0.0             0.396730             0.000000   \n",
       "2                  0.0             0.000000             0.000000   \n",
       "3                  0.0             0.135847             0.086617   \n",
       "4                  0.0             0.000000             0.000000   \n",
       "\n",
       "   description_tfidf15  description_tfidf16  description_tfidf17  \\\n",
       "0                  0.0             0.000000             0.000000   \n",
       "1                  0.0             0.000000             0.000000   \n",
       "2                  0.0             0.000000             0.000000   \n",
       "3                  0.0             0.085192             0.083578   \n",
       "4                  0.0             0.000000             0.000000   \n",
       "\n",
       "   description_tfidf18  description_tfidf19  description_tfidf20  \\\n",
       "0             0.245878              0.00000             0.364591   \n",
       "1             0.000000              0.00000             0.138828   \n",
       "2             0.000000              0.00000             0.000000   \n",
       "3             0.064117              0.07755             0.190148   \n",
       "4             0.000000              0.00000             0.000000   \n",
       "\n",
       "   description_tfidf21  description_tfidf22  description_tfidf23  \\\n",
       "0             0.000000             0.260746             0.325666   \n",
       "1             0.134289             0.099286             0.000000   \n",
       "2             0.000000             0.236476             0.000000   \n",
       "3             0.000000             0.135989             0.000000   \n",
       "4             0.000000             0.279276             0.000000   \n",
       "\n",
       "   description_tfidf24  description_tfidf25  description_tfidf26  \\\n",
       "0                  0.0             0.303185             0.000000   \n",
       "1                  0.0             0.000000             0.247637   \n",
       "2                  0.0             0.000000             0.294905   \n",
       "3                  0.0             0.000000             0.678358   \n",
       "4                  0.0             0.000000             0.000000   \n",
       "\n",
       "   description_tfidf27  description_tfidf28  description_tfidf29  \\\n",
       "0             0.000000             0.000000             0.000000   \n",
       "1             0.000000             0.000000             0.000000   \n",
       "2             0.000000             0.000000             0.149121   \n",
       "3             0.184414             0.100236             0.000000   \n",
       "4             0.000000             0.000000             0.000000   \n",
       "\n",
       "   description_tfidf30  description_tfidf31  description_tfidf32  \\\n",
       "0                  0.0                  0.0             0.312115   \n",
       "1                  0.0                  0.0             0.000000   \n",
       "2                  0.0                  0.0             0.000000   \n",
       "3                  0.0                  0.0             0.081390   \n",
       "4                  0.0                  0.0             0.222864   \n",
       "\n",
       "   description_tfidf33  description_tfidf34  description_tfidf35  \\\n",
       "0             0.000000              0.00000             0.000000   \n",
       "1             0.126252              0.00000             0.000000   \n",
       "2             0.000000              0.00000             0.000000   \n",
       "3             0.000000              0.00000             0.000000   \n",
       "4             0.000000              0.11483             0.835844   \n",
       "\n",
       "   description_tfidf36  description_tfidf37  description_tfidf38  \\\n",
       "0             0.000000             0.326553                  0.0   \n",
       "1             0.000000             0.000000                  0.0   \n",
       "2             0.000000             0.000000                  0.0   \n",
       "3             0.078808             0.000000                  0.0   \n",
       "4             0.000000             0.000000                  0.0   \n",
       "\n",
       "   description_tfidf39  description_tfidf40  description_tfidf41  \\\n",
       "0             0.000000             0.000000             0.310509   \n",
       "1             0.545214             0.000000             0.000000   \n",
       "2             0.162321             0.328658             0.281607   \n",
       "3             0.000000             0.377999             0.000000   \n",
       "4             0.000000             0.000000             0.110859   \n",
       "\n",
       "   description_tfidf42  description_tfidf43  description_tfidf44  \\\n",
       "0             0.000000             0.000000             0.292853   \n",
       "1             0.156471             0.000000             0.111512   \n",
       "2             0.186338             0.341494             0.000000   \n",
       "3             0.107156             0.000000             0.000000   \n",
       "4             0.220064             0.000000             0.000000   \n",
       "\n",
       "   description_tfidf45  description_tfidf46  description_tfidf47  \\\n",
       "0             0.000000             0.000000             0.000000   \n",
       "1             0.000000             0.082373             0.000000   \n",
       "2             0.337796             0.196193             0.136055   \n",
       "3             0.000000             0.056412             0.078241   \n",
       "4             0.000000             0.077234             0.000000   \n",
       "\n",
       "   requirements_tfidf0  requirements_tfidf1  requirements_tfidf2  \\\n",
       "0             0.000000             0.000000             0.000000   \n",
       "1             0.501621             0.000000             0.166066   \n",
       "2             0.000000             0.000000             0.000000   \n",
       "3             0.292460             0.183591             0.145233   \n",
       "4             0.405656             0.000000             0.000000   \n",
       "\n",
       "   requirements_tfidf3  requirements_tfidf4  requirements_tfidf5  \\\n",
       "0                  0.0             0.000000             0.000000   \n",
       "1                  0.0             0.175545             0.228821   \n",
       "2                  0.0             0.000000             0.000000   \n",
       "3                  0.0             0.307044             0.000000   \n",
       "4                  0.0             0.000000             0.000000   \n",
       "\n",
       "   requirements_tfidf6  requirements_tfidf7  requirements_tfidf8  \\\n",
       "0             0.000000             0.000000             0.744641   \n",
       "1             0.423701             0.208787             0.217190   \n",
       "2             0.000000             0.429303             0.446580   \n",
       "3             0.185273             0.365188             0.189943   \n",
       "4             0.000000             0.000000             0.263459   \n",
       "\n",
       "   requirements_tfidf9  requirements_tfidf10  requirements_tfidf11  \\\n",
       "0             0.000000              0.667466              0.000000   \n",
       "1             0.000000              0.000000              0.180581   \n",
       "2             0.721489              0.000000              0.000000   \n",
       "3             0.000000              0.510770              0.000000   \n",
       "4             0.000000              0.000000              0.438102   \n",
       "\n",
       "   requirements_tfidf12  requirements_tfidf13  requirements_tfidf14  \\\n",
       "0              0.000000              0.000000              0.000000   \n",
       "1              0.000000              0.278790              0.193091   \n",
       "2              0.000000              0.000000              0.000000   \n",
       "3              0.000000              0.365723              0.168867   \n",
       "4              0.571475              0.338182              0.000000   \n",
       "\n",
       "   requirements_tfidf15  requirements_tfidf16  requirements_tfidf17  \\\n",
       "0              0.000000              0.000000              0.000000   \n",
       "1              0.185068              0.300812              0.189869   \n",
       "2              0.000000              0.000000              0.000000   \n",
       "3              0.000000              0.263074              0.000000   \n",
       "4              0.000000              0.364896              0.000000   \n",
       "\n",
       "   requirements_tfidf18  requirements_tfidf19  id  \n",
       "0              0.000000              0.000000   0  \n",
       "1              0.190502              0.150466   1  \n",
       "2              0.000000              0.309384   2  \n",
       "3              0.000000              0.263178   3  \n",
       "4              0.000000              0.000000   4  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Fold_1 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[228]\ttrain's auc: 0.999957\tvalid's auc: 0.989111\n",
      "\n",
      "Fold_2 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[210]\ttrain's auc: 0.999951\tvalid's auc: 0.986598\n",
      "\n",
      "Fold_3 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[220]\ttrain's auc: 0.999971\tvalid's auc: 0.989948\n",
      "\n",
      "Fold_4 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[205]\ttrain's auc: 0.999956\tvalid's auc: 0.990655\n",
      "\n",
      "Fold_5 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[221]\ttrain's auc: 0.999976\tvalid's auc: 0.988312\n"
     ]
    }
   ],
   "source": [
    "# Target column, and the feature list: every column except the target and the row id.\n",
    "ycol = 'fraudulent'\n",
    "feature_names = list(\n",
    "    filter(lambda x: x not in [ycol, 'id'], train.columns))\n",
    "\n",
    "# A single estimator object; fit() is called on it once per fold in the loop below.\n",
    "# n_estimators is only an upper bound here, early stopping in fit() picks the tree count.\n",
    "# NOTE(review): in LightGBM, subsample (bagging_fraction) only takes effect when\n",
    "# subsample_freq > 0, which is not set here - confirm whether bagging was intended.\n",
    "model = lgb.LGBMClassifier(objective='binary',\n",
    "                           boosting_type='gbdt',\n",
    "                           tree_learner='serial',\n",
    "                           num_leaves=32,\n",
    "                           max_depth=6,\n",
    "                           learning_rate=0.1,\n",
    "                           n_estimators=10000,\n",
    "                           subsample=0.8,\n",
    "                           feature_fraction=0.6,\n",
    "                           reg_alpha=10,\n",
    "                           reg_lambda=12,\n",
    "                           random_state=1983,\n",
    "                           is_unbalance=True,\n",
    "                           metric='auc')\n",
    "\n",
    "\n",
    "# oof collects one frame of out-of-fold predictions per fold.\n",
    "oof = []\n",
    "# Take an explicit copy: adding a column to a frame selected out of test would otherwise\n",
    "# be a chained assignment (SettingWithCopyWarning, hidden by the global warnings filter).\n",
    "prediction = test[['id']].copy()\n",
    "# Float accumulator: each fold adds its test prediction divided by the number of folds.\n",
    "prediction['fraudulent'] = 0.0\n",
    "df_importance_list = []\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1983)\n",
    "for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):\n",
    "    # Positional indices from the splitter, hence iloc.\n",
    "    X_train = train.iloc[trn_idx][feature_names]\n",
    "    Y_train = train.iloc[trn_idx][ycol]\n",
    "\n",
    "    X_val = train.iloc[val_idx][feature_names]\n",
    "    Y_val = train.iloc[val_idx][ycol]\n",
    "\n",
    "    print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n",
    "\n",
    "    # Both splits are evaluated each round; training stops once the AUC on the last\n",
    "    # eval set (valid) has not improved for 50 rounds.\n",
    "    lgb_model = model.fit(X_train,\n",
    "                          Y_train,\n",
    "                          eval_names=['train', 'valid'],\n",
    "                          eval_set=[(X_train, Y_train), (X_val, Y_val)],\n",
    "                          verbose=500,\n",
    "                          eval_metric='auc',\n",
    "                          early_stopping_rounds=50)\n",
    "\n",
    "    # NOTE(review): LGBMClassifier.predict returns class labels, not probabilities.\n",
    "    # If scores are wanted for AUC or thresholding, predict_proba(...)[:, 1] is the call\n",
    "    # to use - confirm against the evaluation cells further down before changing it.\n",
    "    pred_val = lgb_model.predict(\n",
    "        X_val, num_iteration=lgb_model.best_iteration_)\n",
    "    df_oof = train.iloc[val_idx][['id', ycol]].copy()\n",
    "    df_oof['pred'] = pred_val\n",
    "    oof.append(df_oof)\n",
    "\n",
    "    # Average the per-fold test predictions by adding 1/n_splits of each.\n",
    "    pred_test = lgb_model.predict(\n",
    "        test[feature_names], num_iteration=lgb_model.best_iteration_)\n",
    "    prediction['fraudulent'] += pred_test / kfold.n_splits\n",
    "\n",
    "    # Keep this fold's feature importances for later aggregation.\n",
    "    df_importance = pd.DataFrame({\n",
    "        'column': feature_names,\n",
    "        'importance': lgb_model.feature_importances_,\n",
    "    })\n",
    "    df_importance_list.append(df_importance)\n",
    "\n",
    "    # Drop the per-fold objects before the next iteration to keep memory down.\n",
    "    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>location</td>\n",
       "      <td>379.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>industry</td>\n",
       "      <td>332.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>description_wordsLen</td>\n",
       "      <td>300.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>title</td>\n",
       "      <td>267.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>company_profile_wordsLen</td>\n",
       "      <td>186.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>requirements_wordsLen</td>\n",
       "      <td>184.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>department</td>\n",
       "      <td>165.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>function</td>\n",
       "      <td>144.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>benefits_wordsLen</td>\n",
       "      <td>140.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>description_tfidf46</td>\n",
       "      <td>91.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>description_tfidf5</td>\n",
       "      <td>90.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>description_tfidf18</td>\n",
       "      <td>89.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>requirements_tfidf8</td>\n",
       "      <td>85.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>required_experience</td>\n",
       "      <td>84.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>requirements_tfidf16</td>\n",
       "      <td>74.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>company_profile_tfidf7</td>\n",
       "      <td>74.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>description_tfidf13</td>\n",
       "      <td>74.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>description_tfidf42</td>\n",
       "      <td>73.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>description_tfidf17</td>\n",
       "      <td>71.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>benefits_tfidf1</td>\n",
       "      <td>70.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>salary_range_start</td>\n",
       "      <td>70.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>company_profile_tfidf16</td>\n",
       "      <td>70.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>22</td>\n",
       "      <td>required_education</td>\n",
       "      <td>68.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>23</td>\n",
       "      <td>company_profile_tfidf9</td>\n",
       "      <td>67.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>24</td>\n",
       "      <td>title_wordsLen</td>\n",
       "      <td>65.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25</td>\n",
       "      <td>description_tfidf23</td>\n",
       "      <td>61.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>26</td>\n",
       "      <td>requirements_tfidf13</td>\n",
       "      <td>61.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>27</td>\n",
       "      <td>requirements_tfidf19</td>\n",
       "      <td>59.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28</td>\n",
       "      <td>description_tfidf44</td>\n",
       "      <td>58.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>29</td>\n",
       "      <td>description_tfidf47</td>\n",
       "      <td>53.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>30</td>\n",
       "      <td>salary_range_end</td>\n",
       "      <td>52.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>31</td>\n",
       "      <td>requirements_tfidf2</td>\n",
       "      <td>48.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>32</td>\n",
       "      <td>description_tfidf24</td>\n",
       "      <td>48.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>33</td>\n",
       "      <td>description_tfidf0</td>\n",
       "      <td>48.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>34</td>\n",
       "      <td>description_tfidf25</td>\n",
       "      <td>46.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>35</td>\n",
       "      <td>description_tfidf36</td>\n",
       "      <td>45.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>36</td>\n",
       "      <td>employment_type</td>\n",
       "      <td>45.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>37</td>\n",
       "      <td>description_tfidf22</td>\n",
       "      <td>43.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>38</td>\n",
       "      <td>description_tfidf37</td>\n",
       "      <td>40.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>39</td>\n",
       "      <td>company_profile_tfidf2</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>40</td>\n",
       "      <td>description_tfidf38</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>41</td>\n",
       "      <td>description_tfidf1</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>42</td>\n",
       "      <td>company_profile_tfidf12</td>\n",
       "      <td>38.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>43</td>\n",
       "      <td>has_company_logo</td>\n",
       "      <td>37.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>44</td>\n",
       "      <td>benefits_tfidf2</td>\n",
       "      <td>36.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45</td>\n",
       "      <td>company_profile_tfidf3</td>\n",
       "      <td>36.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>46</td>\n",
       "      <td>description_tfidf19</td>\n",
       "      <td>35.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>47</td>\n",
       "      <td>description_tfidf3</td>\n",
       "      <td>35.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>48</td>\n",
       "      <td>description_tfidf35</td>\n",
       "      <td>35.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>49</td>\n",
       "      <td>description_tfidf8</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50</td>\n",
       "      <td>requirements_tfidf0</td>\n",
       "      <td>31.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>51</td>\n",
       "      <td>company_profile_tfidf4</td>\n",
       "      <td>31.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>52</td>\n",
       "      <td>description_tfidf7</td>\n",
       "      <td>31.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>53</td>\n",
       "      <td>requirements_tfidf7</td>\n",
       "      <td>31.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>54</td>\n",
       "      <td>description_tfidf6</td>\n",
       "      <td>31.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>55</td>\n",
       "      <td>company_profile_tfidf17</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>56</td>\n",
       "      <td>requirements_tfidf17</td>\n",
       "      <td>30.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>57</td>\n",
       "      <td>description_tfidf11</td>\n",
       "      <td>30.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>58</td>\n",
       "      <td>requirements_tfidf9</td>\n",
       "      <td>29.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>59</td>\n",
       "      <td>company_profile_tfidf5</td>\n",
       "      <td>29.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>60</td>\n",
       "      <td>description_tfidf31</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>61</td>\n",
       "      <td>company_profile_tfidf14</td>\n",
       "      <td>28.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>62</td>\n",
       "      <td>description_tfidf2</td>\n",
       "      <td>28.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>63</td>\n",
       "      <td>description_tfidf39</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>64</td>\n",
       "      <td>has_questions</td>\n",
       "      <td>24.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>65</td>\n",
       "      <td>description_tfidf32</td>\n",
       "      <td>24.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>66</td>\n",
       "      <td>description_tfidf16</td>\n",
       "      <td>23.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>67</td>\n",
       "      <td>description_tfidf9</td>\n",
       "      <td>23.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>68</td>\n",
       "      <td>description_tfidf29</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>69</td>\n",
       "      <td>benefits_tfidf4</td>\n",
       "      <td>22.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>70</td>\n",
       "      <td>description_tfidf41</td>\n",
       "      <td>22.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>71</td>\n",
       "      <td>company_profile_tfidf15</td>\n",
       "      <td>22.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>72</td>\n",
       "      <td>requirements_tfidf11</td>\n",
       "      <td>22.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73</td>\n",
       "      <td>company_profile_tfidf18</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>74</td>\n",
       "      <td>description_tfidf4</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>75</td>\n",
       "      <td>description_tfidf12</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>76</td>\n",
       "      <td>description_tfidf26</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>77</td>\n",
       "      <td>description_tfidf15</td>\n",
       "      <td>21.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>78</td>\n",
       "      <td>description_tfidf21</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>79</td>\n",
       "      <td>description_tfidf34</td>\n",
       "      <td>21.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>80</td>\n",
       "      <td>benefits_tfidf11</td>\n",
       "      <td>21.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>81</td>\n",
       "      <td>description_tfidf27</td>\n",
       "      <td>20.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>82</td>\n",
       "      <td>requirements_tfidf5</td>\n",
       "      <td>20.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>83</td>\n",
       "      <td>description_tfidf14</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>84</td>\n",
       "      <td>company_profile_tfidf21</td>\n",
       "      <td>17.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85</td>\n",
       "      <td>requirements_tfidf15</td>\n",
       "      <td>17.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>86</td>\n",
       "      <td>benefits_tfidf7</td>\n",
       "      <td>17.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>87</td>\n",
       "      <td>company_profile_tfidf1</td>\n",
       "      <td>17.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>88</td>\n",
       "      <td>description_tfidf45</td>\n",
       "      <td>17.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>89</td>\n",
       "      <td>requirements_tfidf4</td>\n",
       "      <td>17.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>90</td>\n",
       "      <td>company_profile_tfidf13</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>91</td>\n",
       "      <td>description_tfidf33</td>\n",
       "      <td>16.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>92</td>\n",
       "      <td>requirements_tfidf10</td>\n",
       "      <td>16.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>93</td>\n",
       "      <td>description_tfidf20</td>\n",
       "      <td>16.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>94</td>\n",
       "      <td>requirements_tfidf12</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>95</td>\n",
       "      <td>description_tfidf43</td>\n",
       "      <td>15.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>96</td>\n",
       "      <td>requirements_tfidf14</td>\n",
       "      <td>15.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>97</td>\n",
       "      <td>requirements_tfidf18</td>\n",
       "      <td>15.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>98</td>\n",
       "      <td>benefits_tfidf8</td>\n",
       "      <td>15.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>99</td>\n",
       "      <td>company_profile_tfidf23</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>100</td>\n",
       "      <td>requirements_tfidf1</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>101</td>\n",
       "      <td>requirements_tfidf6</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>102</td>\n",
       "      <td>company_profile_tfidf20</td>\n",
       "      <td>14.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>103</td>\n",
       "      <td>description_tfidf10</td>\n",
       "      <td>14.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>104</td>\n",
       "      <td>benefits_tfidf10</td>\n",
       "      <td>13.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>105</td>\n",
       "      <td>company_profile_tfidf0</td>\n",
       "      <td>11.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>106</td>\n",
       "      <td>description_tfidf28</td>\n",
       "      <td>10.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>107</td>\n",
       "      <td>description_tfidf30</td>\n",
       "      <td>10.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>108</td>\n",
       "      <td>benefits_tfidf3</td>\n",
       "      <td>10.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109</td>\n",
       "      <td>requirements_tfidf3</td>\n",
       "      <td>10.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>110</td>\n",
       "      <td>benefits_tfidf0</td>\n",
       "      <td>10.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>111</td>\n",
       "      <td>benefits_tfidf6</td>\n",
       "      <td>9.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>112</td>\n",
       "      <td>company_profile_tfidf11</td>\n",
       "      <td>8.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>113</td>\n",
       "      <td>company_profile_tfidf10</td>\n",
       "      <td>8.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>114</td>\n",
       "      <td>description_tfidf40</td>\n",
       "      <td>8.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>115</td>\n",
       "      <td>company_profile_tfidf19</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>116</td>\n",
       "      <td>company_profile_tfidf6</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>117</td>\n",
       "      <td>benefits_tfidf9</td>\n",
       "      <td>6.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>118</td>\n",
       "      <td>company_profile_tfidf22</td>\n",
       "      <td>4.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>119</td>\n",
       "      <td>company_profile_tfidf8</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>120</td>\n",
       "      <td>telecommuting</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>121</td>\n",
       "      <td>benefits_tfidf5</td>\n",
       "      <td>1.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       column  importance\n",
       "0                    location       379.8\n",
       "1                    industry       332.6\n",
       "2        description_wordsLen       300.0\n",
       "3                       title       267.0\n",
       "4    company_profile_wordsLen       186.4\n",
       "5       requirements_wordsLen       184.8\n",
       "6                  department       165.2\n",
       "7                    function       144.4\n",
       "8           benefits_wordsLen       140.4\n",
       "9         description_tfidf46        91.4\n",
       "10         description_tfidf5        90.4\n",
       "11        description_tfidf18        89.4\n",
       "12        requirements_tfidf8        85.8\n",
       "13        required_experience        84.4\n",
       "14       requirements_tfidf16        74.6\n",
       "15     company_profile_tfidf7        74.2\n",
       "16        description_tfidf13        74.0\n",
       "17        description_tfidf42        73.6\n",
       "18        description_tfidf17        71.6\n",
       "19            benefits_tfidf1        70.8\n",
       "20         salary_range_start        70.6\n",
       "21    company_profile_tfidf16        70.2\n",
       "22         required_education        68.2\n",
       "23     company_profile_tfidf9        67.0\n",
       "24             title_wordsLen        65.4\n",
       "25        description_tfidf23        61.4\n",
       "26       requirements_tfidf13        61.2\n",
       "27       requirements_tfidf19        59.2\n",
       "28        description_tfidf44        58.2\n",
       "29        description_tfidf47        53.8\n",
       "30           salary_range_end        52.4\n",
       "31        requirements_tfidf2        48.4\n",
       "32        description_tfidf24        48.0\n",
       "33         description_tfidf0        48.0\n",
       "34        description_tfidf25        46.2\n",
       "35        description_tfidf36        45.4\n",
       "36            employment_type        45.2\n",
       "37        description_tfidf22        43.4\n",
       "38        description_tfidf37        40.4\n",
       "39     company_profile_tfidf2        40.0\n",
       "40        description_tfidf38        40.0\n",
       "41         description_tfidf1        40.0\n",
       "42    company_profile_tfidf12        38.6\n",
       "43           has_company_logo        37.8\n",
       "44            benefits_tfidf2        36.8\n",
       "45     company_profile_tfidf3        36.6\n",
       "46        description_tfidf19        35.8\n",
       "47         description_tfidf3        35.6\n",
       "48        description_tfidf35        35.4\n",
       "49         description_tfidf8        33.0\n",
       "50        requirements_tfidf0        31.6\n",
       "51     company_profile_tfidf4        31.6\n",
       "52         description_tfidf7        31.4\n",
       "53        requirements_tfidf7        31.4\n",
       "54         description_tfidf6        31.4\n",
       "55    company_profile_tfidf17        31.0\n",
       "56       requirements_tfidf17        30.8\n",
       "57        description_tfidf11        30.0\n",
       "58        requirements_tfidf9        29.6\n",
       "59     company_profile_tfidf5        29.2\n",
       "60        description_tfidf31        29.0\n",
       "61    company_profile_tfidf14        28.6\n",
       "62         description_tfidf2        28.2\n",
       "63        description_tfidf39        26.0\n",
       "64              has_questions        24.6\n",
       "65        description_tfidf32        24.6\n",
       "66        description_tfidf16        23.2\n",
       "67         description_tfidf9        23.2\n",
       "68        description_tfidf29        23.0\n",
       "69            benefits_tfidf4        22.6\n",
       "70        description_tfidf41        22.6\n",
       "71    company_profile_tfidf15        22.2\n",
       "72       requirements_tfidf11        22.2\n",
       "73    company_profile_tfidf18        22.0\n",
       "74         description_tfidf4        22.0\n",
       "75        description_tfidf12        22.0\n",
       "76        description_tfidf26        22.0\n",
       "77        description_tfidf15        21.8\n",
       "78        description_tfidf21        21.6\n",
       "79        description_tfidf34        21.4\n",
       "80           benefits_tfidf11        21.2\n",
       "81        description_tfidf27        20.8\n",
       "82        requirements_tfidf5        20.2\n",
       "83        description_tfidf14        20.0\n",
       "84    company_profile_tfidf21        17.6\n",
       "85       requirements_tfidf15        17.6\n",
       "86            benefits_tfidf7        17.6\n",
       "87     company_profile_tfidf1        17.4\n",
       "88        description_tfidf45        17.4\n",
       "89        requirements_tfidf4        17.2\n",
       "90    company_profile_tfidf13        17.0\n",
       "91        description_tfidf33        16.4\n",
       "92       requirements_tfidf10        16.4\n",
       "93        description_tfidf20        16.4\n",
       "94       requirements_tfidf12        16.0\n",
       "95        description_tfidf43        15.8\n",
       "96       requirements_tfidf14        15.8\n",
       "97       requirements_tfidf18        15.6\n",
       "98            benefits_tfidf8        15.2\n",
       "99    company_profile_tfidf23        15.0\n",
       "100       requirements_tfidf1        15.0\n",
       "101       requirements_tfidf6        15.0\n",
       "102   company_profile_tfidf20        14.8\n",
       "103       description_tfidf10        14.6\n",
       "104          benefits_tfidf10        13.2\n",
       "105    company_profile_tfidf0        11.6\n",
       "106       description_tfidf28        10.8\n",
       "107       description_tfidf30        10.8\n",
       "108           benefits_tfidf3        10.6\n",
       "109       requirements_tfidf3        10.6\n",
       "110           benefits_tfidf0        10.4\n",
       "111           benefits_tfidf6         9.4\n",
       "112   company_profile_tfidf11         8.8\n",
       "113   company_profile_tfidf10         8.4\n",
       "114       description_tfidf40         8.2\n",
       "115   company_profile_tfidf19         7.6\n",
       "116    company_profile_tfidf6         7.0\n",
       "117           benefits_tfidf9         6.2\n",
       "118   company_profile_tfidf22         4.8\n",
       "119    company_profile_tfidf8         3.0\n",
       "120             telecommuting         2.6\n",
       "121           benefits_tfidf5         1.6"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_importance = pd.concat(df_importance_list)\n",
    "df_importance = df_importance.groupby(['column'])['importance'].agg(\n",
    "    'mean').sort_values(ascending=False).reset_index()\n",
    "df_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "auc: 0.9766968325791855\n"
     ]
    }
   ],
   "source": [
    "df_oof = pd.concat(oof)\n",
    "\n",
    "score = accuracy_score(df_oof[ycol].astype('int'), df_oof['pred'].astype('int'))\n",
    "print('auc:', score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    112\n",
       "1.0     72\n",
       "0.2      6\n",
       "0.8      5\n",
       "0.4      4\n",
       "0.6      1\n",
       "Name: fraudulent, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction.fraudulent.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    128\n",
      "1     72\n",
      "Name: fraudulent, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "sub = prediction.copy(deep=True)\n",
    "sub['fraudulent'] = sub['fraudulent'].apply(lambda x: 1 if x==1 else 0)\n",
    "\n",
    "print(sub.fraudulent.value_counts())\n",
    "\n",
    "sub.to_csv('submissions/{}.csv'.format(score), index=False, header=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
